rm(list=ls())
library(tidyverse)
setwd("~/Downloads/dataverse_files(3)")

getmode <- function(v, na.omit = TRUE){
  if (na.omit == TRUE) {
    v <- na.omit(v)  
  }
  uv <- unique(v)
  tab <- tabulate(match(v, uv))
  out <- uv[tab == max(tab)]
  if(length(out) > 1){
    out <- sample(out,1)
  }
  return(out)
}

# add custom functions to extract estimates (tidy) and goodness-of-fit (glance) information
tidy.feglm <- function(x, ...) {
  s <- summary(x,  type = "sandwich")
  ret <- data.frame(
    term      = row.names(s$cm),
    estimate  = s$cm[, 1],
    std.error  = s$cm[, 2]
  )
  ret
}

glance.feglm <- function(x, ...) {
  ret <- data.frame(
    Null.Dev = x$null.deviance,
    Deviance = x$deviance,
    N   = x$nobs[4],
    `Committee FE` = as.integer(length(x$nms.fe[[1]])),
    `Congress FE` = as.integer(length(x$nms.fe[[2]])))
  ret
}


hearings<- read_csv("Data/ThinkTankDocumentsCitations.csv")

hearings$tt_ideo <- fct_relevel(hearings$tt_ideo, "Right", "Left")

hearings$policy_document_id %>% unique() %>% length()

num_ref <- hearings$dois_cited %>% unique() %>% length()
#156809

num_ref_linked <- hearings$id %>% unique() %>% length()

max(hearings$published_on)
#"2022-11-21"
hearings$policy_document_id %>% unique() %>% length()
#210407
hearings$policy_source_id %>% unique() %>% length()
#121

hearings$policy_source_id[hearings$tt_ideo=="Left"] %>% unique() %>% length()
hearings$policy_source_id[hearings$tt_ideo=="Right"] %>% unique() %>% length()

## for SI 2

source_EIN <- read_csv("Data/Think Tank EINs.csv")
hearings_title <- hearings %>% left_join(dplyr::select(source_EIN, `Source ID`, Title), by =c("policy_source_id" = "Source ID"))
hearings_title %>% filter(tt_ideo=="Right") %>% dplyr::select(Title) %>% unique() %>% write_csv("Output/TableS6_1.csv")
hearings_title %>% filter(tt_ideo=="Left") %>% dplyr::select(Title) %>% unique() %>% write_csv("Output/TableS6_2.csv")

###### document level dataset


documents <- read_csv("Data/ThinkTankDocumentsAnalysisData.csv")
documents$tt_ideo <- fct_relevel(documents$tt_ideo, "Right", "Left")
documents2 <- documents %>% mutate(pub_year = as.numeric(as.factor(pub_year)))
documents2$tt_ideo <- fct_relevel(documents2$tt_ideo, "Right", "Left")

library(cowplot)



library(lme4)
doc_cites_model_time <- glmer(cites_science ~ tt_ideo*pub_year + (1|policy_source_id), data = documents2, family = binomial(link = "logit"))


library(sjPlot)
library(sjmisc)
library(ggplot2)
library(modelsummary)

time_pred_plot <- plot_model(doc_cites_model_time, type = "pred", terms = c("pub_year [all]", "tt_ideo")) + ylab("Probability that an ideological think tank document cites science") + xlab("Year") +
  ggtitle("") + scale_color_manual("", labels  = c("Right", "Left"), values=c("#9A3E25","#156B90")) + theme_minimal(base_size = 7) +
  theme(legend.position = "bottom") + scale_fill_manual("", labels  = c("Right", "Left"), values=c("#9A3E25","#156B90")) +
  scale_x_continuous("Year", breaks = seq(1,21, by = 5), labels = seq(1,21, by = 5)+1999) 


ggsave("Output/Fig1C.pdf", time_pred_plot, device = "pdf", width = 4, height = 4, units="in")

modelsummary(doc_cites_model_time, output = "Output/TableS14.docx")




#### Overall Difference in Citation to Science 
documents$tt_ideo <- fct_relevel(documents$tt_ideo, "Right", "Left")

doc_cites_model <- glmer(cites_science ~ tt_ideo  + (1|pub_year) + (1|policy_source_id) , data = documents,  family = binomial(link = "logit"))
#Figure 2H
library(broom.mixed)


modelsummary(doc_cites_model, output = "Output/TableS15.docx")



#By FOR

load("Data/ThinkTank_documents_FOR.RData")
FOR_models <- lapply(documents_FOR, function(i)  glmer(cites_science ~ tt_ideo  + (1|pub_year) + (1|policy_source_id), nAGQ = 0, data = i, family = binomial(link = "logit"))) 

FOR_results <- do.call(rbind, lapply(FOR_models, function(i) tidy(i))) %>% as_tibble() %>% filter(term == "tt_ideoLeft")
FORs <- lapply(documents_FOR, function(i) pull(i, FOR)[1]) %>%  unlist()

FOR_results$FOR_code <- FORs

for_labels <- read_csv("Data/Dimensions-Fields-of-Research-2020-10-28_14-28-57.csv") 
for_labels <- for_labels %>% dplyr::select(Name, `Fields of Research code`)
names(for_labels) <- c("FOR_text", "FOR_code")
for_labels$FOR_code <- as.numeric(for_labels$FOR_code)
FOR_results$FOR_code <- as.numeric(FOR_results$FOR_code)

FOR_results <- FOR_results %>% left_join(for_labels)


FOR_results$FOR_text <- as.character(FOR_results$FOR_text)
FOR_results$FOR_text[is.na(FOR_results$FOR_text)] <- "Unknown"
FOR_results <- FOR_results %>% mutate(FOR_text = str_replace_all(FOR_text, "and", "&"))


FOR_results$FOR_text <- fct_reorder(FOR_results$FOR_text, FOR_results$estimate, .fun =mean)

for_cc_plot <- ggplot(FOR_results, aes(x=FOR_text, group = FOR_text, y=estimate, ymin = estimate - 1.96*std.error, ymax = estimate + 1.96*std.error)) + 
  geom_point(color = "#156B90", size =3 ) +
  geom_errorbar(size=2, width=0,alpha=.65, colour = "#156B90") + 
  coord_flip() + theme_minimal(base_size = 7) + 
  geom_hline(yintercept = 0, linetype = 3) +
  ylab("Log odds") +xlab("")

ggsave("Output/FigS6C.pdf", for_cc_plot, device = "pdf", width = 8, height = 4, units="in")

FORs <- as_tibble(FORs)
names(FORs) <- c("FOR_code")

FORs <- FORs %>% left_join(for_labels)
FORs$FOR_text[is.na(FORs$FOR_text)] <- "Unknown"
FOR_results
names(FOR_models) <- FORs$FOR_text
modelsummary(FOR_models, output = "Output/TableS16-17.docx")
####Uses only year fixed effects

######## trend by hit paper


library(sjPlot)
library(sjmisc)
library(ggplot2)
library(modelsummary)
theme_set(theme_sjplot())

documents <- documents %>% mutate(pub_year = as.numeric(as.factor(pub_year)))

doc_hitcites_model_time <- glmer(cites_hitscience ~ tt_ideo*pub_year +(1|policy_source_id), data = documents, family = binomial(link = "logit"))

hit_time_pred_plot <- plot_model(doc_hitcites_model_time, type = "pred", terms = c("pub_year [all]", "tt_ideo")) + ylab("Probability that an ideological think tank document cites hit science") + xlab("Year") +
  ggtitle("Cites to hit (top 5%) scientific papers") + scale_color_manual("", labels  = c("Right", "Left"), values=c("#9A3E25","#156B90")) + theme(legend.position = "bottom")  +theme_minimal() +
  scale_x_continuous("Year", breaks = seq(1,21, by = 5), labels = seq(1,21, by = 5)+1999) +theme(legend.position = "bottom")


doc_nohitcites_model_time <- glmer(cites_nonhitscience ~ tt_ideo*pub_year + (1|policy_source_id), data = documents, family = binomial(link = "logit"))

nohit_time_pred_plot <- plot_model(doc_nohitcites_model_time, type = "pred", terms = c("pub_year [all]", "tt_ideo")) + ylab("Probability that an ideological think tank document cites non-hit science") + xlab("Year") +
  ggtitle("Cites to non-hit scientific papers") + scale_color_manual("", labels  = c("Right", "Left"), values=c("#9A3E25","#156B90"))  +theme_minimal() +
  scale_x_continuous("Year", breaks = seq(1,21, by = 5), labels = seq(1,21, by = 5)+1999) +theme(legend.position = "bottom")

library(cowplot)

tt_hit_time_pred_plot <- cowplot::plot_grid(hit_time_pred_plot, nohit_time_pred_plot, nrow = 1)

ggsave("Output/FigS8.pdf", tt_hit_time_pred_plot, device = "pdf", width = 8, height = 4, units="in")

modelsummary(list(doc_hitcites_model_time, doc_nohitcites_model_time), output = "HitThinkTankPartyTime.docx")




######### By Issue area
library(tidyverse)
classifications <- read_csv("Data/ThinkTankDocumentClassifications.csv")

documents <- read_csv("Data/ThinkTankDocumentsAnalysisData.csv")

documents <- documents %>% left_join(classifications)

issues <- documents$class1 %>% unique()

doc_issues <- documents %>% split(.$class1)


doc_issues <- lapply(doc_issues, function(i)  mutate(i, tt_ideo = fct_relevel(tt_ideo, "Right", "Left"))) 
#remove 'human interest' classification because there is only 1 associated document
doc_issues <- doc_issues[names(doc_issues) %in% "human interest" == FALSE]   
library(lme4)
library(broom.mixed)
issue_models <- lapply(doc_issues, function(i)  glmer(cites_science ~ tt_ideo  + (1|pub_year) + (1|policy_source_id), nAGQ = 0, data = i, family = binomial(link = "logit"))) 

issue_results <- do.call(rbind, lapply(issue_models, function(i) tidy(i))) %>% as_tibble() %>% filter(term == "tt_ideoLeft")


issue_results$issue <- names(doc_issues)

issue_results <- issue_results %>% 
  mutate(issue = str_replace_all(issue, "and", "&"))

issue_results$issue <- fct_reorder(issue_results$issue, issue_results$estimate, .fun =mean)

issue_tt_plot <- ggplot(issue_results, aes(x=issue, group = issue, y=estimate, ymin = estimate - 1.96*std.error, ymax = estimate + 1.96*std.error)) + 
  geom_point(color = "#156B90", size =3 ) +
  geom_errorbar(size=2, width=0,alpha=.65, colour = "#156B90") + 
  coord_flip() + theme_minimal(base_size = 7) + 
  geom_hline(yintercept = 0, linetype = 3) +
  ylab("Log odds") +xlab("")

ggsave("Output/FigS6D.pdf", issue_tt_plot, device = "pdf", width = 6, height = 4, units="in")

library(modelsummary)

library(cowplot)


issue_models <- as.list(issue_models)

names(issue_models) <- names(doc_issues)

modelsummary(issue_models, output = "Output/TableS18.docx")

###### Overlap



hearings <- hearings %>% left_join(classifications)



cited_papers_FOR <- read_csv("Data/ThinkTankPaperFOR.csv") %>% dplyr::select(-`...1`) %>% filter(for_field_id < 100) %>% unique()


hearings_FOR <- hearings %>% left_join(cited_papers_FOR, by = c("id" = "paper_id")) %>% unique()
hearings_FOR <- hearings_FOR %>% rename(FOR_code = for_field_id)
for_labels <- read_csv("Data/Dimensions-Fields-of-Research-2020-10-28_14-28-57.csv") 
for_labels <- for_labels %>% dplyr::select(Name, `Fields of Research code`)
names(for_labels) <- c("FOR_text", "FOR_code")
for_labels$FOR_code <- as.numeric(for_labels$FOR_code)
hearings_FOR$FOR_code <- as.numeric(hearings_FOR$FOR_code)

hearings_FOR <- hearings_FOR %>% left_join(for_labels)
hearings_FOR
table(hearings_FOR$class1, hearings_FOR$FOR_text)

demrep <- hearings %>% filter(!is.na(dois_cited)) %>% group_by(dois_cited) %>% 
  summarise(Left = sum(Left, na.rm =T), Right = sum(Right,na.rm =T)) %>% 
  mutate(cite_type = case_when(Left > 0 & Right > 0 ~ "Both",
                                Right > 0 ~ "R", 
                                 Left > 0 ~ "L")) %>% 
  dplyr::select(-Left, -Right)

hearings_FOR <- hearings_FOR %>% left_join(demrep)

overlap_FOR <- hearings_FOR %>% filter(!is.na(FOR_code)) %>% dplyr::select(dois_cited, cite_type, FOR_text) %>% group_by(FOR_text) %>% 
  summarise(both = length(unique(dois_cited[cite_type == "Both"])),
            D = length(unique(dois_cited[cite_type == "L"])),
            R = length(unique(dois_cited[cite_type == "R"])),
            All = length(unique(dois_cited)),
            both_share = both/All, 
            D_share = D/All, 
            R_share = R/All)

overlap_class <- hearings_FOR %>% filter(!is.na(FOR_code)) %>% dplyr::select(dois_cited, cite_type, class1) %>% group_by(class1) %>% 
  summarise(both = length(unique(dois_cited[cite_type == "Both"])),
            D = length(unique(dois_cited[cite_type == "L"])),
            R = length(unique(dois_cited[cite_type == "R"])),
            All = length(unique(dois_cited)),
            both_share = both/All, 
            D_share = D/All, 
            R_share = R/All)


overlap_class <- overlap_class %>% mutate(class1 = fct_reorder(class1, both_share)) %>% filter(!is.na(class1))
class_both <- overlap_class %>% ggplot(aes(x=both_share, y = class1)) + geom_point() + theme_minimal() + scale_x_continuous(label = scales::percent) +xlab("Share of science cited by both R and L") + ylab("")


overlap_FOR <- overlap_FOR %>% mutate(FOR_text = fct_reorder(FOR_text, both_share)) %>% filter(!is.na(FOR_text))
FOR_both <- overlap_FOR %>% ggplot(aes(x=both_share, y = FOR_text)) + geom_point() + theme_minimal() + scale_x_continuous(label = scales::percent) +xlab("Share of science cited by both R and L") + ylab("")

library(cowplot)

for_class_overlap <- cowplot::plot_grid(FOR_both, class_both, nrow = 1)


ggsave("Output/FigS18.pdf", for_class_overlap, width = 10, height = 4)


######### time series overlap


demrep <- hearings %>% filter(!is.na(dois_cited)) %>% group_by(dois_cited) %>% summarise(Left = sum(Left, na.rm =T), Right = sum(Right,na.rm =T)) %>% mutate(cite_type = case_when(Left > 0 & Right > 0 ~ "Both",
                                                                                                                                                                                   Right > 0 ~ "R", 
                                                                                                                                                                                   Left > 0 ~ "L")) %>% dplyr::select(-Left, -Right)
hearings <- hearings %>% left_join(demrep)

hearings <- hearings %>% dplyr::select(-class1) %>% unique()
overlap <- hearings %>% summarise(both = length(unique(dois_cited[cite_type == "Both"])),
                                  L = length(unique(dois_cited[cite_type == "L"])),
                                  R = length(unique(dois_cited[cite_type == "R"])),
                                  All = length(unique(dois_cited)),
                                  both_share = both/All, 
                                  L_share = L/All, 
                                  R_share = R/All)



overlap <- overlap %>% dplyr::select(both_share, L_share, R_share) %>% pivot_longer(cols = 1:3, names_to = "type", values_to = "val")

overlap_time  <- hearings %>% group_by(pub_year) %>% summarise(both = length(unique(dois_cited[cite_type == "Both"])),
                                                               L = length(unique(dois_cited[cite_type == "L"])),
                                                               R = length(unique(dois_cited[cite_type == "R"])),
                                                               All = length(unique(dois_cited)),
                                                               both_share = both/All, 
                                                               L_share = L/All, 
                                                               R_share = R/All)


overlap_p <- overlap %>% ggplot(aes(x=type, y=val, fill = type)) + geom_bar(stat="identity") + theme_classic() + 
  scale_y_continuous(labels = scales::percent) + 
  scale_x_discrete(labels = c("Both", "Left only", "Right only")) + 
  scale_fill_manual("", values=c("#666666", "#156B90","#9A3E25")) + 
  ylab("Percent of cited science") + xlab("") + theme(legend.position = "none")


overlap_time_p <- overlap_time %>% ggplot(aes(x=pub_year, y=both_share)) + geom_line() + theme_minimal() +
  scale_y_continuous(labels = scales::percent, limits = c(0,1)) + ylab("Percent of cited science") + xlab("Year") 


library(cowplot)

p_grid <- cowplot::plot_grid(overlap_p, overlap_time_p)

ggsave("Output/FigS14.pdf", p_grid, width = 11, height = 5)

